library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(DT)
# Read the per-bin table and tidy it into `neon_jgi_pilot_ind_bins`:
#   * shorten the completeness / contamination column names and treat the
#     genome name as the site description,
#   * put the taxa categories of the GTDB-Tk lineage in separate columns
#     (Domain .. Genus),
#   * split the site description into Site, Sample Name, Site ID, Subplot,
#     Layer and Date.
neon_jgi_pilot_ind_bins <- read_csv("neon-jgi_pilot_ind_assembly_bins.csv") %>%
  rename(
    Completeness = `Bin Completeness`,
    Contamination = `Bin Contamination`,
    Site = `Genome Name`
  ) %>%
  # Strip the rank prefixes one after another (first occurrence of each).
  # "s__" is deliberately left alone: only six ranks are kept below.
  mutate(
    `GTDB-Tk Taxonomy Lineage` = reduce(
      c("d__", "p__", "c__", "o__", "f__", "g__"),
      str_remove,
      .init = `GTDB-Tk Taxonomy Lineage`
    )
  ) %>%
  # Lineages with more than six ranks lose the extra pieces and shorter ones
  # are padded with NA; separate() reports both cases as warnings.
  separate(
    `GTDB-Tk Taxonomy Lineage`,
    into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus"),
    sep = "; "
  ) %>%
  # Simplify Site name
  mutate(Site = str_remove(Site, "Soil microbial communities from ")) %>%
  separate(Site, into = c("Site", "Sample Name"), sep = " - ") %>%
  mutate(`Sample Name` = str_remove(`Sample Name`, "-comp-1")) %>%
  # Keep `Sample Name` itself and additionally break it into its components.
  separate(
    `Sample Name`,
    into = c("Site ID", "subplot.layer.date"),
    sep = "_",
    remove = FALSE
  ) %>%
  separate(
    `subplot.layer.date`,
    into = c("Subplot", "Layer", "Date"),
    sep = "-"
  )
## Rows: 1130 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): Bin ID, Genome Name, Bin Quality, Bin Lineage, GTDB-Tk Taxonomy L...
## dbl  (10): IMG Genome ID, Bin Completeness, Bin Contamination, Total Number ...
## date  (1): Date Added
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: Expected 6 pieces. Additional pieces discarded in 21 rows [92, 131, 132, 228,
## 252, 575, 576, 678, 723, 730, 824, 825, 826, 827, 828, 891, 925, 946, 981, 996,
## ...].
## Warning: Expected 6 pieces. Missing pieces filled with `NA` in 282 rows [39, 40, 41, 57,
## 59, 64, 65, 66, 93, 94, 133, 145, 147, 148, 149, 172, 176, 181, 186, 187, ...].
# Save the tidied bin table as a tab-separated file in the working directory.
write_tsv(
  x = neon_jgi_pilot_ind_bins,
  file = "neon_jgi_pilot_ind_bins.tsv"
)

Tables

Table of Phylum Counts

# Interactive table with one row per Phylum: number of bins (n) and their
# share of all bins in percent (freq), numeric columns rounded to one decimal.
neon_jgi_pilot_ind_bins %>%
  count(Phylum) %>%
  mutate(freq = 100 * n / sum(n)) %>%
  mutate(across(where(is.numeric), ~ round(.x, 1))) %>%
  datatable()

Table of Bin Counts for Each Metagenome

# Interactive table with the number of bins (n) for every combination of
# `IMG Genome ID` and Site. count() returns an ungrouped table, so no
# grouping is carried into datatable() and summarise() has no message to emit.
neon_jgi_pilot_ind_bins %>%
  count(`IMG Genome ID`, Site) %>%
  datatable()

Table of Bin Counts for Each Site

# Interactive table with the number of bins (n) per Site.
neon_jgi_pilot_ind_bins %>%
  count(Site) %>%
  datatable()

Graphs

Phyla bar chart

Phyla bar chart

#### Class bar chart

Phylum Count Bar chart by site

#### Class Count Bar chart by site

#### Site Count Bar chart by phylum

#### Site Count Bar chart by Class

Site Count Bar Chart for Actinobacteriota

Histogram of genome sizes

# Histogram of `Total Number of Bases` per bin, in 500,000-base wide bars.
# The x axis carries the genome size and the y axis the number of bins
# falling into each size class, so both axes are labelled accordingly.
neon_jgi_pilot_ind_bins %>%
  ggplot(aes(x = `Total Number of Bases`)) +
  geom_histogram(colour = "black", fill = "maroon", binwidth = 500000) +
  ggtitle("Genome size of MAGs") +
  xlab("Genome size (bases)") +
  ylab("Number of MAGs") +
  theme(text = element_text(size = 20, colour = "black"))

#  theme(axis.text.x = element_text(angle = 90))